# All libraries are present here:
knitr::opts_chunk$set(echo = TRUE)
library(readxl)
library(ggplot2)
library(ggExtra)
## Warning: package 'ggExtra' was built under R version 4.0.5
library(moments)
library(Hotelling)
## Loading required package: corpcor
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ tibble 3.1.6 ✓ dplyr 1.0.7
## ✓ tidyr 1.2.0 ✓ stringr 1.4.0
## ✓ readr 2.1.2 ✓ forcats 0.5.1
## ✓ purrr 0.3.4
## Warning: package 'tidyr' was built under R version 4.0.5
## Warning: package 'readr' was built under R version 4.0.5
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## x dplyr::summarise() masks Hotelling::summarise()
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
# Reading all the spreadsheets
bankruptcy = read_xlsx("bankruptcy.xlsx")
census = read_xlsx("census.xlsx")
skull = read_xlsx("skulldata.xlsx")
wordpar = read_xlsx("wordparity.xlsx")
Getting a sample
# Skull data work
samplevals = data.frame()
for(i in c(1:3)) {
setframe = skull[skull$Time == i, ]
setvals = sample.int(length(setframe$Time), 15)
sampleframe = setframe[setvals, ]
samplevals = rbind(samplevals, sampleframe)
}
samplevals
## # A tibble: 45 × 5
## MaxBr BasHt BasLength NasHt Time
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 126 133 102 51 1
## 2 128 132 93 53 1
## 3 136 143 100 54 1
## 4 125 136 93 48 1
## 5 132 136 100 50 1
## 6 126 129 109 51 1
## 7 138 135 100 55 1
## 8 124 138 101 46 1
## 9 141 140 100 51 1
## 10 130 130 104 49 1
## # … with 35 more rows
Finding the overall mean of the sample
overallmean = colMeans(samplevals[1:4])
overallmean
## MaxBr BasHt BasLength NasHt
## 132.86667 133.73333 97.51111 50.64444
Level Means
l1means = colMeans(samplevals[samplevals$Time == 1,1:4])
l2means = colMeans(samplevals[samplevals$Time == 2,1:4])
l3means = colMeans(samplevals[samplevals$Time == 3,1:4])
l1means
## MaxBr BasHt BasLength NasHt
## 130.93333 134.93333 99.26667 50.86667
l2means
## MaxBr BasHt BasLength NasHt
## 133.53333 133.46667 98.66667 50.20000
l3means
## MaxBr BasHt BasLength NasHt
## 134.13333 132.80000 94.60000 50.86667
Treatment sum of squares
treatment = (15 * (sum((overallmean - l1means)^2)))+(15 *(sum((overallmean - l2means) ^2)) )+(15 * (sum((overallmean - l3means)^2)))
treatment
## [1] 320.3556
Residual sum of squares
resid = ((sum((samplevals[samplevals$Time == 1,1:4] - l1means)^2))) + ((sum((samplevals[samplevals$Time == 2,1:4] - l2means) ^2))) + ((sum((samplevals[samplevals$Time == 3,1:4] - l3means)^2)))
resid
## [1] 406845.5
Total Sum of squares
total = ((sum((samplevals[samplevals$Time == 1,1:4] - overallmean)^2))) + ((sum((samplevals[samplevals$Time == 2,1:4] - overallmean) ^2))) + ((sum((samplevals[samplevals$Time == 3,1:4] - overallmean)^2)))
total
## [1] 406601.4
Manova Table:
titles = c("Treatment, B", "Residual, W", "Total")
SS = c(treatment, resid, total)
df = c(3, 36, 42)
data.frame(titles, SS, df)
## titles SS df
## 1 Treatment, B 320.3556 3
## 2 Residual, W 406845.4667 36
## 3 Total 406601.3778 42
Wilks lambda
wlambda = (resid) / (resid + treatment)
wlambda
## [1] 0.9992132
setvals2 = sample.int(length(wordpar$worddiff), 20)
samplevals2 = wordpar[setvals2, ]
samplevals2
## # A tibble: 20 × 4
## worddiff wordsame Arabicdiff Arabicsame
## <dbl> <dbl> <dbl> <dbl>
## 1 751 744 683 553
## 2 731 662 662. 624
## 3 1172. 896. 926 766
## 4 774 735 671 612
## 5 1096. 1009 1076 983
## 6 1044 909 865 839
## 7 814. 750. 711 625
## 8 1225 1179 1037 906.
## 9 995 875 678 659
## 10 976. 872. 814 735
## 11 656. 660. 572 539
## 12 869 860. 691 601
## 13 708 669 657 572.
## 14 767 738. 724 639
## 15 726 674 663 583
## 16 982 894 831 640
## 17 1056 930. 833 826
## 18 1126 954 888 728
## 19 747 752. 777 688.
## 20 1408. 1311 854 986
samplevals2$word = samplevals2$worddiff - samplevals2$wordsame
samplevals2$Arabic = samplevals2$Arabicsame - samplevals2$Arabicdiff
dbardiffvsame = c(((1/20) * sum(samplevals2$word)), ((1/20)*(sum(samplevals2$Arabic))))
dbardiffvsame
## [1] 77.45 -75.50
varianceword = sum((samplevals2$word - dbardiffvsame[1])^2) * (1 / (20 - 1))
covwordarab = sum((samplevals2$word - dbardiffvsame[1]) * (samplevals2$Arabic - dbardiffvsame[2])) * (1 / (20 -1))
variancearab = sum((samplevals2$Arabic - dbardiffvsame[2])^2) * (1 / (20 - 1))
varianceword
## [1] 4594.918
covwordarab
## [1] -478.0395
variancearab
## [1] 4820.632
Doing the rest of the calculation by hand, we have: (T^2) = n * (Dbar’) * (inv(S)) * (Dbar) = 40.2
F-value is:
(((19)*(2))/(18))*(qf(0.95, 2, 18))
## [1] 7.504065
By this, we know that the reactions for same vs different numbers are different, on average.
samplevals2$diff = samplevals2$worddiff - samplevals2$Arabicdiff
samplevals2$same = samplevals2$wordsame - samplevals2$Arabicsame
dbarwordarab = c(((1/20) * sum(samplevals2$diff)), ((1/20)*(sum(samplevals2$same))))
dbarwordarab
## [1] 150.525 148.575
variancediff = sum((samplevals2$diff - dbarwordarab[1])^2) * (1 / (20 - 1))
covdiffsame = sum((samplevals2$diff - dbarwordarab[1]) * (samplevals2$same - dbarwordarab[2])) * (1 / (20 -1))
variancesame = sum((samplevals2$same - dbarwordarab[2])^2) * (1 / (20 - 1))
variancediff
## [1] 16694.85
covdiffsame
## [1] 7691.445
variancesame
## [1] 7147.507
Doing the rest of the calculation by hand, we have: (T^2) = n * (Dbar’) * (inv(S)) * (Dbar) = 43.2
By this, we know that the reactions for written vs arabic numbers are also different, on average.
setvals3 = sample.int(length(census$totpop), 40)
samplevals3 = census[setvals3, ]
samplevals3
## # A tibble: 40 × 5
## totpop prof emp gov homeval
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 6.03 3.1 76.0 25 1.08
## 2 1.64 16.7 64.6 49.4 3.13
## 3 3.38 2.17 66.1 22.6 1.44
## 4 4.36 1.67 65.4 23.7 1.07
## 5 5.54 9.25 74.9 31 2.23
## 6 5.44 2.93 73.6 22.3 1.65
## 7 5.48 1.34 77.4 21.6 1.32
## 8 3.58 3.38 65.6 26.1 1.31
## 9 5.34 3.41 72.6 20.1 1.66
## 10 6.29 2.6 64.3 27.4 1.78
## # … with 30 more rows
first the covariance matrix:
covmatcensus = cov(samplevals3)
covmatcensus
## totpop prof emp gov homeval
## totpop 3.19938045 -1.275331 1.7773157 -2.691683 -0.02120096
## prof -1.27533109 10.990055 -2.1098702 11.838561 1.49675673
## emp 1.77731571 -2.109870 51.8161892 -41.694099 -0.29842186
## gov -2.69168269 11.838561 -41.6940994 109.634455 1.86011859
## homeval -0.02120096 1.496757 -0.2984219 1.860119 0.34017122
and now the eigenvalues + vectors:
decmp = eigen(covmatcensus)
decmp
## eigen() decomposition
## $values
## [1] 132.656873 30.638647 9.592309 2.967553 0.124869
##
## $vectors
## [,1] [,2] [,3] [,4] [,5]
## [1,] 0.02558977 -0.006026141 0.1524519 0.986522961 -0.0532891113
## [2,] -0.09433218 -0.171562548 -0.9602729 0.142290487 -0.1389143399
## [3,] 0.45856771 -0.881475104 0.1075368 -0.033945944 -0.0008957122
## [4,] -0.88314857 -0.439108416 0.1648970 -0.005441539 -0.0034302049
## [5,] -0.01452079 -0.026747284 -0.1260131 0.073102171 0.9888632404
Based on the eigenvalues present, I would want to have 3 principle components present.
a1x = c(0,-0.955, 0.0241, -0.157, 0.0442, -0.0539)
y1 = sum(a1x^2)
y1
## [1] 0.9421137
a2x = c(0, 0.299, 0.941, 0.0554, -0.185, -0.155)
y2 = sum(a2x^2)
y2
## [1] 1.036201
a3x = c(0,-0.0784, 0.0639, 0.791, 0.586, -0.144)
y3 = sum(a3x^2)
y3
## [1] 1.000043
I can’t see anything in the slides pertaining to a confidence region for this question. In this case, I will be skipping it and moving ahead to other questions.
Calculating by hand, I got: L = [32.5 51.6 31.1 134 0.639]
L * L’ = [1060 1680 1010 4370 20.8 1680 2660 1060 6940 33 1010 1600 966 4180 19.9 4370 6940 4180 18100 85.9 20.8 33 19.9 85.9 0.408]
bankruptcy
## # A tibble: 46 × 5
## cftd nita cacl cans pop
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 -0.45 -0.41 1.09 0.45 0
## 2 -0.56 -0.31 1.51 0.16 0
## 3 0.06 0.02 1.01 0.4 0
## 4 -0.07 -0.09 1.45 0.26 0
## 5 -0.1 -0.09 1.56 0.67 0
## 6 -0.14 -0.07 0.71 0.28 0
## 7 0.04 0.01 1.5 0.71 0
## 8 -0.07 -0.06 1.37 0.4 0
## 9 0.07 -0.01 1.37 0.34 0
## 10 -0.14 -0.14 1.42 0.43 0
## # … with 36 more rows
avai_id = which(!is.na(bankruptcy$nita))
bankruptcy1 = bankruptcy[avai_id, ]
pop1vals = sample.int(length(bankruptcy$pop[bankruptcy1$pop == 0]), 11)
pop2vals = sample.int(length(bankruptcy$pop[bankruptcy1$pop == 1]), 15)
pop1 = bankruptcy1[bankruptcy1$pop == 0, ]
pop2 = bankruptcy1[bankruptcy1$pop == 1, ]
samplepop1 = pop1[pop1vals, ]
samplepop2 = pop2[pop2vals, ]
samplepop = rbind(samplepop1, samplepop2)
samplepop1
## # A tibble: 11 × 5
## cftd nita cacl cans pop
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 -0.1 -0.09 1.56 0.67 0
## 2 -0.07 -0.09 1.45 0.26 0
## 3 0.06 0.02 1.01 0.4 0
## 4 0.37 0.11 1.99 0.38 0
## 5 -0.56 -0.31 1.51 0.16 0
## 6 0.12 0.11 1.14 0.17 0
## 7 0.07 0.02 1.31 0.25 0
## 8 0.15 0.05 1.88 0.27 0
## 9 -0.28 -0.27 1.27 0.51 0
## 10 -0.14 -0.14 1.42 0.43 0
## 11 0.01 0 2.15 0.7 0
samplepop2
## # A tibble: 15 × 5
## cftd nita cacl cans pop
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.54 0.11 2.33 0.48 1
## 2 0.29 0.06 1.84 0.38 1
## 3 0.15 0.05 2.17 0.55 1
## 4 0.14 0.07 2.61 0.52 1
## 5 0.31 0.05 4.45 0.69 1
## 6 0.56 0.11 4.29 0.44 1
## 7 0.47 0.14 2.92 0.45 1
## 8 0.32 0.07 4.24 0.63 1
## 9 0.17 0.07 1.8 0.52 1
## 10 0.2 0.08 1.99 0.3 1
## 11 0.12 0.05 2.52 0.69 1
## 12 0.15 0.06 2.23 0.56 1
## 13 0.14 -0.03 0.46 0.26 1
## 14 0.51 0.1 2.49 0.54 1
## 15 0.08 0.02 2.01 0.53 1
samplepop
## # A tibble: 26 × 5
## cftd nita cacl cans pop
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 -0.1 -0.09 1.56 0.67 0
## 2 -0.07 -0.09 1.45 0.26 0
## 3 0.06 0.02 1.01 0.4 0
## 4 0.37 0.11 1.99 0.38 0
## 5 -0.56 -0.31 1.51 0.16 0
## 6 0.12 0.11 1.14 0.17 0
## 7 0.07 0.02 1.31 0.25 0
## 8 0.15 0.05 1.88 0.27 0
## 9 -0.28 -0.27 1.27 0.51 0
## 10 -0.14 -0.14 1.42 0.43 0
## # … with 16 more rows
meanspop1 = colMeans(samplepop1)
meanspop2 = colMeans(samplepop2)
meanspop1
## cftd nita cacl cans pop
## -0.03363636 -0.05363636 1.51727273 0.38181818 0.00000000
meanspop2
## cftd nita cacl cans pop
## 0.27666667 0.06733333 2.55666667 0.50266667 1.00000000
meanssample = colMeans(samplepop)
meanssample
## cftd nita cacl cans pop
## 0.14538462 0.01615385 2.11692308 0.45153846 0.57692308
correctpop1vals = samplepop1 - meanssample
correctpop1vals
## cftd nita cacl cans pop
## 1 -0.24538462 -0.10615385 -0.55692308 0.2184615 -0.57692308
## 2 -0.08615385 -2.20692308 0.99846154 -0.3169231 -0.14538462
## 3 -2.05692308 -0.43153846 0.43307692 0.2546154 -0.01615385
## 4 -0.08153846 -0.46692308 1.84461538 0.3638462 -2.11692308
## 5 -1.13692308 -0.45538462 1.49384615 -1.9569231 -0.45153846
## 6 -0.02538462 0.09384615 -0.97692308 -0.2815385 -0.57692308
## 7 0.05384615 -2.09692308 0.85846154 -0.3269231 -0.14538462
## 8 -1.96692308 -0.40153846 1.30307692 0.1246154 -0.01615385
## 9 -0.73153846 -0.84692308 1.12461538 0.4938462 -2.11692308
## 10 -0.71692308 -0.28538462 1.40384615 -1.6869231 -0.45153846
## 11 -0.13538462 -0.01615385 0.03307692 0.2484615 -0.57692308
correctpop2vals = samplepop2 - meanssample
correctpop2vals
## cftd nita cacl cans pop
## 1 0.39461538 -0.03538462 2.18461538 0.33461538 0.8546154
## 2 0.27384615 0.04384615 1.82384615 0.36384615 0.9838462
## 3 -1.96692308 -2.06692308 0.05307692 -1.56692308 -1.1169231
## 4 -0.31153846 -0.38153846 2.15846154 0.06846154 0.5484615
## 5 -0.26692308 -0.52692308 3.87307692 0.11307692 0.4230769
## 6 0.41461538 -0.03538462 4.14461538 0.29461538 0.8546154
## 7 0.45384615 0.12384615 2.90384615 0.43384615 0.9838462
## 8 -1.79692308 -2.04692308 2.12307692 -1.48692308 -1.1169231
## 9 -0.28153846 -0.38153846 1.34846154 0.06846154 0.5484615
## 10 -0.37692308 -0.49692308 1.41307692 -0.27692308 0.4230769
## 11 -0.02538462 -0.09538462 2.37461538 0.54461538 0.8546154
## 12 0.13384615 0.04384615 2.21384615 0.54384615 0.9838462
## 13 -1.97692308 -2.14692308 -1.65692308 -1.85692308 -1.1169231
## 14 0.05846154 -0.35153846 2.03846154 0.08846154 0.5484615
## 15 -0.49692308 -0.55692308 1.43307692 -0.04692308 0.4230769
correctsamplevals = samplepop - meanssample
correctsamplevals
## cftd nita cacl cans pop
## 1 -0.24538462 -0.106153846 -0.55692308 0.21846154 -0.57692308
## 2 -0.08615385 -2.206923077 0.99846154 -0.31692308 -0.14538462
## 3 -2.05692308 -0.431538462 0.43307692 0.25461538 -0.01615385
## 4 -0.08153846 -0.466923077 1.84461538 0.36384615 -2.11692308
## 5 -1.13692308 -0.455384615 1.49384615 -1.95692308 -0.45153846
## 6 -0.02538462 0.093846154 -0.97692308 -0.28153846 -0.57692308
## 7 0.05384615 -2.096923077 0.85846154 -0.32692308 -0.14538462
## 8 -1.96692308 -0.401538462 1.30307692 0.12461538 -0.01615385
## 9 -0.73153846 -0.846923077 1.12461538 0.49384615 -2.11692308
## 10 -0.71692308 -0.285384615 1.40384615 -1.68692308 -0.45153846
## 11 -0.13538462 -0.016153846 0.03307692 0.24846154 -0.57692308
## 12 0.52384615 -2.006923077 1.87846154 -0.09692308 0.85461538
## 13 -1.82692308 -0.391538462 1.26307692 0.23461538 0.98384615
## 14 -0.30153846 -0.526923077 2.02461538 0.53384615 -1.11692308
## 15 -0.43692308 -0.075384615 2.59384615 -1.59692308 0.54846154
## 16 0.16461538 0.033846154 2.33307692 0.23846154 0.42307692
## 17 0.54384615 -2.006923077 3.83846154 -0.13692308 0.85461538
## 18 -1.64692308 -0.311538462 2.34307692 0.30461538 0.98384615
## 19 -0.13153846 -0.506923077 4.09461538 0.61384615 -1.11692308
## 20 -0.40692308 -0.075384615 1.78384615 -1.59692308 0.54846154
## 21 0.05461538 0.063846154 -0.12692308 -0.15153846 0.42307692
## 22 0.10384615 -2.066923077 2.06846154 0.11307692 0.85461538
## 23 -1.96692308 -0.391538462 1.65307692 0.41461538 0.98384615
## 24 -0.31153846 -0.606923077 0.31461538 0.24384615 -1.11692308
## 25 -0.06692308 -0.045384615 2.47384615 -1.57692308 0.54846154
## 26 -0.06538462 0.003846154 -0.10692308 0.07846154 0.42307692
cov(correctpop1vals)
## cftd nita cacl cans pop
## cftd 0.59170584 -0.151888913 -0.1856146 0.034700958 -0.17981017
## nita -0.15188891 0.612056105 -0.2417393 0.007512017 -0.09483719
## cacl -0.18561457 -0.241739295 0.7995777 -0.215452458 -0.18232386
## cans 0.03470096 0.007512017 -0.2154525 0.678152222 -0.20629253
## pop -0.17981017 -0.094837187 -0.1823239 -0.206292528 0.57111655
cov(correctpop2vals)
## cftd nita cacl cans pop
## cftd 0.7180692 0.6727220 0.8368467 0.6557682 0.6505520
## nita 0.6727220 0.6470101 0.7486209 0.6353453 0.6282953
## cacl 0.8368467 0.7486209 1.9505099 0.7943957 0.7103938
## cans 0.6557682 0.6353453 0.7943957 0.6397930 0.6177582
## pop 0.6505520 0.6282953 0.7103938 0.6177582 0.6112157
cov(correctsamplevals)
## cftd nita cacl cans pop
## cftd 0.60950310 -0.18846646 0.08081515 -0.04387453 -0.08558987
## nita -0.18846646 0.58018789 -0.25565593 -0.07367497 -0.08177217
## cacl 0.08081515 -0.25565593 1.50603442 -0.09270030 0.20866370
## cans -0.04387453 -0.07367497 -0.09270030 0.60897230 -0.15525956
## pop -0.08558987 -0.08177217 0.20866370 -0.15525956 0.82069033
solve(cov(correctpop1vals))
## cftd nita cacl cans pop
## cftd 4.408537 2.887534 3.203368 1.841689 3.555356
## nita 2.887534 4.053395 3.106021 1.771759 3.213745
## cacl 3.203368 3.106021 4.436073 2.365448 3.794919
## cans 1.841689 1.771759 2.365448 2.929763 2.687453
## pop 3.555356 3.213745 3.794919 2.687453 5.586211
solve(cov(correctpop2vals))
## cftd nita cacl cans pop
## cftd 105.514680 -275.20844 -6.454224 49.72244 127.84026
## nita -275.208442 2009.02319 -12.172110 -13.19278 -1744.76116
## cacl -6.454224 -12.17211 2.386188 -12.29536 29.03544
## cans 49.722442 -13.19278 -12.295363 131.65972 -158.13956
## pop 127.840262 -1744.76116 29.035437 -158.13956 1785.16985
solve(cov(correctsamplevals))
## cftd nita cacl cans pop
## cftd 1.9299985 0.7083343 -0.0104728 0.3079973 0.3327871
## nita 0.7083343 2.1922036 0.3204672 0.4400061 0.2940607
## cacl -0.0104728 0.3204672 0.7450532 0.1166224 -0.1365313
## cans 0.3079973 0.4400061 0.1166224 1.8356310 0.3935786
## pop 0.3327871 0.2940607 -0.1365313 0.3935786 1.3916639
confusionMatrix(as.matrix(solve(cov(correctpop1vals))))
## Confusion Matrix and Statistics
##
## cftd nita cacl cans pop
## cftd 4.408537 2.887534 3.203368 1.841689 3.555356
## nita 2.887534 4.053395 3.106021 1.771759 3.213745
## cacl 3.203368 3.106021 4.436073 2.365448 3.794919
## cans 1.841689 1.771759 2.365448 2.929763 2.687453
## pop 3.555356 3.213745 3.794919 2.687453 5.586211
##
## Overall Statistics
##
## Accuracy : 0.2736
## 95% CI : (NA, NA)
## No Information Rate : NA
## P-Value [Acc > NIR] : NA
##
## Kappa : 0.0867
##
## Mcnemar's Test P-Value : 1
##
## Statistics by Class:
##
## Class: cftd Class: nita Class: cacl Class: cans Class: pop
## Sensitivity 0.27733 0.26964 0.26240 0.25265 0.29654
## Specificity 0.81582 0.82638 0.79679 0.87002 0.77703
## Pos Pred Value 0.27733 0.26964 0.26240 0.25265 0.29654
## Neg Pred Value 0.81582 0.82638 0.79679 0.87002 0.77703
## Prevalence 0.20310 0.19206 0.21600 0.14816 0.24068
## Detection Rate 0.05633 0.05179 0.05668 0.03743 0.07137
## Detection Prevalence 0.20310 0.19206 0.21600 0.14816 0.24068
## Balanced Accuracy 0.54657 0.54801 0.52959 0.56133 0.53679